LayerNormFusion

对输入数组沿最后一维或指定维度执行 LayerNorm 归一化,并应用可学习的 gamma 和 beta 参数。

\[\text{dst}_i = \frac{\text{src}_i - \mu}{\sqrt{\sigma^2 + \epsilon}} \cdot \gamma_i + \beta_i \quad \text{where} \quad \mu = \frac{1}{N} \sum_{j=1}^{N} \text{src}_j,\quad \sigma^2 = \frac{1}{N} \sum_{j=1}^{N} (\text{src}_j - \mu)^2\]
输入:
  • src_data - 输入数据地址。

  • gamma_data - 缩放参数 gamma。

  • beta_data - 偏置参数 beta。

  • param_inner_size - 参数内层尺寸。

  • param_outer_size - 参数外层尺寸。

  • norm_inner_size - 归一化内层尺寸。

  • norm_outer_size - 归一化外层尺寸。

  • epsilon - 避免除零的小常数。

  • task_id - 当前核心ID(仅适用于私有存储版本)。

  • thread_num - 核心总数(仅适用于私有存储版本,固定值1)。

  • core_mask - 核掩码(仅适用于共享存储版本)。

输出:
  • dst_data - 输出数据地址。

  • out_mean - 每个归一化单元的均值(可选)。

  • out_variance - 每个归一化单元的方差(可选)。

支持平台:

FT78NE MT7004

备注:

  • FT78NE 支持fp, int8

  • MT7004 支持hp, fp

共享存储版本:

void fp_layernormfusion_s(float *src_data, float *gamma_data, float *beta_data, float *dst_data, float *out_mean, float *out_variance, int param_inner_size, int param_outer_size, int norm_inner_size, int norm_outer_size, float epsilon, int length, int core_mask)
void hp_layernormfusion_s(half *src_data, half *gamma_data, half *beta_data, half *dst_data, half *out_mean, half *out_variance, int param_inner_size, int param_outer_size, int norm_inner_size, int norm_outer_size, float epsilon, int length, int core_mask)
void i8_layernormfusion_s(int8_t *src_data, int8_t *gamma_data, int8_t *beta_data, int8_t *dst_data, float *out_mean, float *out_variance, int param_inner_size, int param_outer_size, int norm_inner_size, int norm_outer_size, float epsilon, int length, int core_mask)

C调用示例:

 1#include <stdio.h>
 2#include <layernormfusion.h>
 3
 4int main() {
 5    float *src = (float *)0xA0000000;        // 输入在DDR空间
 6    float *gamma = (float *)0xA1000000;
 7    float *beta = (float *)0xA2000000;
 8    float *dst = (float *)0xC0000000;
 9    float *out_mean = (float *)0xD0000000;
10    float *out_variance = (float *)0xD1000000;
11    int param_inner_size = 16;
12    int param_outer_size = 1;
13    int norm_inner_size = 16;
14    int norm_outer_size = 2;
15    float epsilon = 1e-5;
16    int core_mask = 0xff;
17    int length = 1024;
18
19    fp_layernormfusion_s(src, gamma, beta, dst, out_mean, out_variance, param_inner_size, param_outer_size, norm_inner_size, norm_outer_size, epsilon, length, core_mask);
20    return 0;
21}

私有存储版本:

void fp_layernormfusion_p(float *src_data, float *gamma_data, float *beta_data, float *dst_data, float *out_mean, float *out_variance, int param_inner_size, int param_outer_size, int norm_inner_size, int norm_outer_size, float epsilon)
void hp_layernormfusion_p(half *src_data, half *gamma_data, half *beta_data, half *dst_data, half *out_mean, half *out_variance, int param_inner_size, int param_outer_size, int norm_inner_size, int norm_outer_size, float epsilon)
void i8_layernormfusion_p(int8_t *src_data, int8_t *gamma_data, int8_t *beta_data, int8_t *dst_data, float *out_mean, float *out_variance, int param_inner_size, int param_outer_size, int norm_inner_size, int norm_outer_size, float epsilon)

C调用示例:

 1#include <stdio.h>
 2#include <layernormfusion.h>
 3
 4int main() {
 5    float *src = (float *)0x10810000;        // 输入在L2空间
 6    float *gamma = (float *)0x10811000;
 7    float *beta = (float *)0x10812000;
 8    float *dst = (float *)0x10820000;
 9    float *out_mean = (float *)0x10821000;
10    float *out_variance = (float *)0x10822000;
11    int param_inner_size = 16;
12    int param_outer_size = 1;
13    int norm_inner_size = 16;
14    int norm_outer_size = 2;
15    float epsilon = 1e-5;
16
17    fp_layernormfusion_p(src, gamma, beta, dst, out_mean, out_variance, param_inner_size, param_outer_size, norm_inner_size, norm_outer_size, epsilon);
18    return 0;
19}